使用 python 提取 pdf 中 table 中包含的文本的最佳方法是什么？

Question

我正在构建一个程序来从 pdf 中提取文本，将其放入结构化格式，然后将其发送到数据库。我有大约 1,400 个单独的 pdf，它们都遵循类似的格式，但文档总结的措辞和计划设计中的细微差别使它变得棘手。

我在 python 中试用过几个不同的 pdf 阅读器，包括 tabula-py 和 pdfminer，但其中没有一个非常适合我想做的事情。Tabula 可以很好地读取所有文本，但它是严格按水平方向逐行提取内容的，没有考虑到有些文本是在一个方框（单元格）内换行的。例如，如果您打开我附加的示例 SBC，其中写着 "What is the overall deductible?"，而 Tabula 读到的却是 "What is the overall 0/Individual or..."，忽略了 "deductible" 这个词实际上是第一句的一部分这一事实。（请注意，我正在使用的文件是 pdf，但我附上了 jpeg，因为我不知道如何附上 pdf。）

import tabula

# Placeholder: set this to the location of the PDF to parse.
filepath = "sample.pdf"

# pandas_options is handed on to pandas; 'header': None asks it not to treat
# the first extracted row as column names.
# NOTE(review): recent tabula-py releases return a list of DataFrames from
# read_pdf by default - confirm against the installed version before relying
# on .iloc below.
df = tabula.read_pdf(filepath, pandas_options={'header': None})

print(df.iloc[0][0])
print(df)

最后，我真的很希望能够解析出每个框中的文本，以便更好地识别哪些值属于免赔额、自付费用限额、copays/coinsurance 等。我认为也许某种 OCR 可以让我识别 PDF 的哪些部分包含在蓝色矩形中，然后从中提取字符串，但我真的不知道从哪里开始。Sample SBC

Answer 1

我认为完成所需操作的最佳方法是查找并隔离文件中的单元格，然后将 OCR 应用于每个单独的单元格。

SO 上有许多解决方案，我从其中一个答案中获得了代码，并稍微调整了一些参数以获得以下输出（尚不完美，但您可以自己再稍作调整）。

import os
import cv2
import imutils

# This only works if there's only one table on a page
# Important parameters:
#  - morph_size
#  - min_text_height_limit
#  - max_text_height_limit
#  - cell_threshold
#  - min_columns


def pre_process_image(img, save_in_file, morph_size=(23, 23)):
    """Turn a page image into a binary image in which text forms solid blobs.

    The image is converted to grayscale, binarised with an Otsu threshold,
    and its dark regions are thickened by dilating the inverted image with a
    rectangular kernel, so that neighbouring glyphs merge together.

    Args:
        img: BGR image array (it is passed to ``cv2.cvtColor`` with
            ``COLOR_BGR2GRAY``).
        save_in_file: Path the pre-processed image is written to, or ``None``
            to skip writing.
        morph_size: Size of the rectangular dilation kernel.

    Returns:
        The binarised image with thickened dark regions.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # NOTE(review): with THRESH_OTSU set, OpenCV derives the threshold itself,
    # so the explicit 250 should have no effect - confirm in the OpenCV docs.
    _, binary = cv2.threshold(gray, 250, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, morph_size)
    # Dilation grows bright pixels, so the text is made bright by inversion,
    # dilated, and then inverted back.
    thickened = cv2.dilate(~binary, kernel, anchor=(-1, -1), iterations=1)
    pre = ~thickened

    if save_in_file is not None:
        cv2.imwrite(save_in_file, pre)
    return pre


def find_text_boxes(pre, min_text_height_limit=20, max_text_height_limit=120):
    """Return bounding boxes of contours whose height looks like text.

    Args:
        pre: Pre-processed binary image to search for contours.
        min_text_height_limit: Exclusive lower bound on the box height.
        max_text_height_limit: Exclusive upper bound on the box height.

    Returns:
        A list of ``(x, y, w, h)`` bounding rectangles, one per accepted
        contour.
    """
    # findContours returns (contours, hierarchy) in OpenCV 2.x/4.x but
    # (image, contours, hierarchy) in 3.x; the contour list is always the
    # second-to-last element, so index from the end instead of unpacking.
    found = cv2.findContours(pre, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    contours = found[-2]

    # Keep only the boxes whose height (index 3) is within the text-size limits.
    return [
        box
        for box in map(cv2.boundingRect, contours)
        if min_text_height_limit < box[3] < max_text_height_limit
    ]


def find_table_in_boxes(boxes, cell_threshold=100, min_columns=3):
    """Group bounding boxes into table rows.

    Boxes are bucketed by ``y // cell_threshold``; every bucket holding at
    least ``min_columns`` boxes is treated as one table row.

    Args:
        boxes: Iterable of ``(x, y, w, h)`` bounding boxes.
        cell_threshold: Height in pixels of the horizontal bands used to
            decide that boxes belong to the same row.
        min_columns: Minimum number of boxes a band needs to count as a row.

    Returns:
        A list of rows ordered by the ``y`` of each row's first cell. Each
        row is a list of boxes sorted in ascending tuple order, i.e. by
        ``x`` first.
    """
    rows = {}

    # Cluster the boxes into horizontal bands by their top coordinate.
    for box in boxes:
        _, y, _, _ = box
        rows.setdefault(y // cell_threshold, []).append(box)

    # Drop bands with fewer than min_columns boxes and order each remaining
    # row left to right.
    table_cells = [sorted(row) for row in rows.values() if len(row) >= min_columns]
    # Order the rows top to bottom.
    table_cells.sort(key=lambda row: row[0][1])

    return table_cells


def build_lines(table_cells):
    """Derive the grid lines of a table from its rows of cell boxes.

    Args:
        table_cells: List of rows, each a list of ``(x, y, w, h)`` boxes, or
            ``None``.

    Returns:
        A ``(hor_lines, ver_lines)`` pair of lists of ``(x1, y1, x2, y2)``
        segments. There is one horizontal line at the top of every row plus a
        closing line at the bottom, and one vertical line at the left of
        every cell of the first row plus a closing line at the right. Both
        lists are empty when ``table_cells`` is ``None`` or empty.
    """
    if not table_cells:
        return [], []

    first_row = table_cells[0]
    last_row = table_cells[-1]

    # Right edge: taken from the row whose last cell is the widest.
    widest_tail = max(table_cells, key=lambda row: row[-1][2])[-1]
    max_x = widest_tail[0] + widest_tail[2]

    # Bottom edge: taken from the tallest cell of the last row.
    tallest = max(last_row, key=lambda cell: cell[3])
    max_y = tallest[1] + tallest[3]

    # Each row contributes a line from its first cell's corner to the right edge.
    hor_lines = [(row[0][0], row[0][1], max_x, row[0][1]) for row in table_cells]
    # Each first-row cell contributes a line from its corner down to the bottom.
    ver_lines = [(cell[0], cell[1], cell[0], max_y) for cell in first_row]

    # Close the grid on the right and at the bottom.
    ver_lines.append((max_x, first_row[-1][1], max_x, max_y))
    hor_lines.append((first_row[0][0], max_y, max_x, max_y))

    return hor_lines, ver_lines


if __name__ == "__main__":
    # Fixed input/output locations relative to the working directory.
    in_file = os.path.join(".", "test.jpg")
    pre_file = os.path.join(".", "pre.png")
    out_file = os.path.join(".", "out.png")

    img = cv2.imread(in_file)
    # cv2.imread reports a missing or unreadable file by returning None rather
    # than raising, so stop here with a clear message.
    if img is None:
        raise SystemExit("Could not read input image: {}".format(in_file))

    pre_processed = pre_process_image(img, pre_file)
    text_boxes = find_text_boxes(pre_processed)
    cells = find_table_in_boxes(text_boxes)
    hor_lines, ver_lines = build_lines(cells)

    # Visualize the result on a copy so the loaded image stays untouched.
    vis = img.copy()

    # Debug aid: uncomment to also outline every detected text box.
    # for box in text_boxes:
    #     (x, y, w, h) = box
    #     cv2.rectangle(vis, (x, y), (x + w - 2, y + h - 2), (0, 255, 0), 1)

    # Draw the horizontal lines first, then the vertical ones, in red (BGR).
    for x1, y1, x2, y2 in hor_lines + ver_lines:
        cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)

    cv2.imwrite(out_file, vis)

Answer 2

@jpnadas 在这种情况下，您从我的答案中复制的代码并不真正适用，因为它解决的是 table 周围没有网格的情况。该算法寻找重复的文本块，并尝试以启发式方式找到类似于 table 的模式。

但在这种特殊情况下，table 确实有网格，利用这一优势，我们可以获得更准确的结果。

策略如下：

1. 增加图像 Gamma 使网格更暗
2. 去除颜色并应用 Otsu 阈值处理
3. 在图像中找到长的垂直线和水平线，并使用 erode 和 dilate 函数从中创建掩码
4. 使用 findContours 函数在掩码中查找单元格块
5. 找到 table 对象

5.1 剩下的步骤可以与查找无网格 table 的那篇 post 相同：以启发式方式查找 table 结构

5.2 另一种方法是使用 findContours 函数返回的 hierarchy。这种方法更加准确，并且允许在单个图像上查找多个 table。
有了单元格坐标，就很容易从原始图像中提取特定的单元格图像：

# Crop one cell out of the page image: the first index spans rows (y), the
# second spans columns (x), so the (x, y, w, h) cell box maps to [y:y+h, x:x+w].
cell_image = image[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
对每个cell_image应用OCR。

但是！我认为 OpenCV 方法应当是在无法直接读取 PDF 内容时的最后手段：例如，当 PDF 中包含的是光栅图像时。

如果它是基于矢量的 PDF 并且其内容是可读的，那么找到 table 内部内容并从中读取文本而不是做繁重的 'OCR lifting' 更有意义。

为了更准确地识别 table，参考代码如下：

import os
import imutils
import numpy as np
import argparse
import cv2


def gamma_correction(image, gamma=1.0):
    """Apply gamma correction to an 8-bit image through a lookup table.

    Every intensity ``i`` in 0..255 is mapped to ``(i / 255) ** gamma * 255``,
    clipped to 0..255 and stored as ``uint8``.

    Args:
        image: Image to transform; it is passed to ``cv2.LUT``.
        gamma: Exponent applied to the normalised intensity.

    Returns:
        The image produced by ``cv2.LUT`` with the gamma table.
    """
    # Compute the 256 mapped levels, then pack them into the 1x256 uint8
    # table that is handed to cv2.LUT.
    levels = [
        np.clip(pow(level / 255.0, gamma) * 255.0, 0, 255)
        for level in range(256)
    ]
    look_up_table = np.array(levels, dtype=np.uint8).reshape(1, 256)

    return cv2.LUT(image, look_up_table)


def pre_process_image(image):
    """Binarise a page image so that its dark content becomes white.

    Args:
        image: BGR image array (it is converted with ``COLOR_BGR2GRAY``).

    Returns:
        The inverted Otsu-thresholded image.
    """
    # Gamma 2 is applied first to make the table lines darker.
    darkened = gamma_correction(image, 2)

    # Drop the colour information.
    gray = cv2.cvtColor(darkened, cv2.COLOR_BGR2GRAY)

    # Otsu threshold to reveal the important areas; only the image is needed,
    # not the threshold value that is returned alongside it.
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

    # Invert so that the dark content becomes the bright part.
    return ~thresh


def get_horizontal_lines_mask(image, horizontal_size=100):
    """Build a mask of the long horizontal strokes in a binary image.

    The image is eroded and then dilated (a morphological opening) with a
    ``horizontal_size`` x 1 rectangular kernel.

    Args:
        image: Binary image; it is not modified.
        horizontal_size: Width in pixels of the structuring element.

    Returns:
        The opened image.
    """
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal_size, 1))
    # Erosion wipes out bright runs the kernel does not fit into; dilation
    # with the same kernel grows the survivors back.
    eroded = cv2.erode(image, kernel, anchor=(-1, -1), iterations=1)
    return cv2.dilate(eroded, kernel, anchor=(-1, -1), iterations=1)


def get_vertical_lines_mask(image, vertical_size=100):
    """Build a mask of the long vertical strokes in a binary image.

    The image is eroded and then dilated (a morphological opening) with a
    1 x ``vertical_size`` rectangular kernel.

    Args:
        image: Binary image; it is not modified.
        vertical_size: Height in pixels of the structuring element.

    Returns:
        The opened image.
    """
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vertical_size))
    # Erode first, then dilate the result with the same kernel.
    opened = cv2.erode(image, kernel, anchor=(-1, -1), iterations=1)
    opened = cv2.dilate(opened, kernel, anchor=(-1, -1), iterations=1)
    return opened


def make_lines_mask(preprocessed, min_horizontal_line_size=100, min_vertical_line_size=100):
    """Combine the horizontal and vertical line masks into one inverted mask.

    Args:
        preprocessed: Binary image produced by the pre-processing step.
        min_horizontal_line_size: Kernel width passed to
            ``get_horizontal_lines_mask``.
        min_vertical_line_size: Kernel height passed to
            ``get_vertical_lines_mask``.

    Returns:
        The bitwise inverse of the union of both line masks.
    """
    line_masks = (
        get_horizontal_lines_mask(preprocessed, min_horizontal_line_size),
        get_vertical_lines_mask(preprocessed, min_vertical_line_size),
    )

    # Start from an all-zero single-channel mask and OR each line mask into it.
    height, width = preprocessed.shape[0], preprocessed.shape[1]
    mask = np.zeros((height, width, 1), dtype=np.uint8)
    for line_mask in line_masks:
        mask = cv2.bitwise_or(mask, line_mask)

    return ~mask


def find_cell_boxes(mask):
    """Return bounding boxes of the contours found in the lines mask.

    Contours are visited from the largest area to the smallest, and boxes at
    least 95% as wide as the image are dropped to exclude the page-sized
    shape.

    Args:
        mask: Image to search for contours.

    Returns:
        A list of ``(x, y, w, h)`` bounding rectangles in descending order of
        contour area.
    """
    # The tuple returned by findContours differs between OpenCV 3 and 4;
    # imutils.grab_contours is used to pull the contour list out of it.
    found = cv2.findContours(mask, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    largest_first = sorted(imutils.grab_contours(found), key=cv2.contourArea, reverse=True)

    width_limit = 0.95 * mask.shape[1]

    # Keep every box narrower than the page-width limit (width is index 2).
    return [
        box
        for box in map(cv2.boundingRect, largest_first)
        if box[2] < width_limit
    ]


def find_table_in_boxes(boxes, cell_threshold=10, min_columns=2):
    """Group cell bounding boxes into table rows.

    Boxes are bucketed by ``y // cell_threshold``; every bucket holding at
    least ``min_columns`` boxes is treated as one table row.

    Args:
        boxes: Iterable of ``(x, y, w, h)`` bounding boxes.
        cell_threshold: Height in pixels of the horizontal bands used to
            decide that boxes belong to the same row.
        min_columns: Minimum number of boxes a band needs to count as a row.

    Returns:
        A list of rows ordered by the ``y`` of each row's first cell. Each
        row is a list of boxes sorted in ascending tuple order, i.e. by
        ``x`` first.
    """
    bands = {}

    # Cluster the boxes into horizontal bands by their top coordinate.
    for box in boxes:
        _, y, _, _ = box
        bands.setdefault(y // cell_threshold, []).append(box)

    # Keep bands with enough boxes, ordering each one left to right.
    table_cells = [sorted(band) for band in bands.values() if len(band) >= min_columns]
    # Order the rows top to bottom.
    table_cells.sort(key=lambda row: row[0][1])

    return table_cells


def build_vertical_lines(table_cells):
    """Derive both the horizontal and the vertical grid lines of a table.

    Despite its name, the function returns both sets of lines.

    Args:
        table_cells: List of rows, each a list of ``(x, y, w, h)`` boxes, or
            ``None``.

    Returns:
        A ``(hor_lines, ver_lines)`` pair of lists of ``(x1, y1, x2, y2)``
        segments; both lists are empty when ``table_cells`` is ``None`` or
        empty.
    """
    if not table_cells:
        return [], []

    top_row = table_cells[0]
    bottom_row = table_cells[-1]

    # Right edge of the table: right side of the widest last-column cell.
    right_cell = max(table_cells, key=lambda row: row[-1][2])[-1]
    max_x = right_cell[0] + right_cell[2]

    # Bottom edge of the table: bottom side of the tallest cell in the last row.
    bottom_cell = max(bottom_row, key=lambda cell: cell[3])
    max_y = bottom_cell[1] + bottom_cell[3]

    hor_lines = []
    for row in table_cells:
        # Line along the top of the row, starting at its first cell.
        left, top = row[0][0], row[0][1]
        hor_lines.append((left, top, max_x, top))

    ver_lines = []
    for cell in top_row:
        # Line along the left side of each first-row cell, down to the bottom.
        left, top = cell[0], cell[1]
        ver_lines.append((left, top, left, max_y))

    # Closing line on the right, starting at the top of the last first-row cell.
    ver_lines.append((max_x, top_row[-1][1], max_x, max_y))
    # Closing line at the bottom, starting at the left of the first cell.
    hor_lines.append((top_row[0][0], max_y, max_x, max_y))

    return hor_lines, ver_lines


if __name__ == "__main__":
    ap = argparse.ArgumentParser()
    # The argument is read with cv2.imread below, so it names one image file.
    ap.add_argument("-i", "--image", required=True, help="path to the input image")
    args = vars(ap.parse_args())

    in_file = args["image"]
    # splitext removes only the trailing extension; str.replace would also
    # strip the same text anywhere else in the path.
    filename_base = os.path.splitext(in_file)[0]

    img = cv2.imread(in_file)
    # cv2.imread reports a missing or unreadable file by returning None rather
    # than raising, so stop here with a clear message.
    if img is None:
        raise SystemExit("Could not read input image: {}".format(in_file))

    pre_processed = pre_process_image(img)

    # Visualizing pre-processed image
    cv2.imwrite(filename_base + ".pre.png", pre_processed)

    # The minimum line sizes are hard-coded in pixels for the sample page.
    lines_mask = make_lines_mask(pre_processed, min_horizontal_line_size=1800, min_vertical_line_size=500)

    # Visualizing table lines mask
    cv2.imwrite(filename_base + ".mask.png", lines_mask)

    cell_boxes = find_cell_boxes(lines_mask)

    cells = find_table_in_boxes(cell_boxes)

    # apply OCR to each cell rect here
    # the cells array contains cell coordinates in tuples (x, y, w, h)

    hor_lines, ver_lines = build_vertical_lines(cells)

    # Visualize the table lines on a copy so the loaded image stays untouched.
    vis = img.copy()

    # Draw the horizontal lines first, then the vertical ones, in red (BGR).
    for x1, y1, x2, y2 in hor_lines + ver_lines:
        cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)

    cv2.imwrite(filename_base + ".result.png", vis)

一些参数是硬编码的：

页面大小阈值 - 0.95
最小水平线大小 - 1800 像素
最小垂直线大小 - 500 像素

您可以将它们作为可配置参数提供，或者使它们与图像大小相关。

结果：

使用 python 提取 pdf 中 table 中包含的文本的最佳方法是什么？

What is the best way to extract text contained within a table in a pdf using python?

pdf

ocr

pdf-reader

python-3.x